In [288]:
# Regression. Numeric and Categorical Predictors. Dummy Variables and Interactions.
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Set working directory and load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data") # Change the working directory
Auto = pd.read_csv("Auto.csv") # Read the data file in the CSV format
In [290]:
# Prepare X and Y, fit a linear regression model, and plot the regression line with data
Weight = Auto['weight']
MPG = Auto['mpg']
X = sm.add_constant(Weight)
reg = sm.OLS( MPG, X ).fit()
print(reg.summary())
plt.scatter(Weight, MPG, label='Data', s=15)
plt.plot(Weight, reg.predict(X), color='red', label='Regression Line')
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon'); plt.title('Linear regression line');
plt.legend(); plt.show()
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.692
Model:                            OLS   Adj. R-squared:                  0.691
Method:                 Least Squares   F-statistic:                     886.6
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          5.37e-103
Time:                        10:29:59   Log-Likelihood:                -1146.0
No. Observations:                 397   AIC:                             2296.
Df Residuals:                     395   BIC:                             2304.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         46.3174      0.796     58.166      0.000      44.752      47.883
weight        -0.0077      0.000    -29.776      0.000      -0.008      -0.007
==============================================================================
Omnibus:                       40.133   Durbin-Watson:                   0.797
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               56.057
Skew:                           0.712   Prob(JB):                     6.72e-13
Kurtosis:                       4.166   Cond. No.                     1.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
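In [ ]:
# A minimal illustration (a sketch, not part of the fitted output above): use the
# estimated coefficients to predict MPG for a hypothetical 3,000 lb car.
b0, b1 = reg.params['const'], reg.params['weight']
print(f"Predicted MPG for a 3000 lb car: {b0 + b1 * 3000:.1f}")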
In [292]:
# Is it the same linear relationship for American, Asian, and European cars?
# Map colors according to 'origin' and plot weight vs mpg colored by the origin
Auto['color'] = Auto['origin'].map({1: 'orange', 2: 'blue', 3: 'green'}) # Replace the color names with your desired colors
plt.scatter(Auto['weight'], Auto['mpg'], c=Auto['color'])
plt.xlabel('Weight'); plt.ylabel('MPG')
plt.title('Weight vs MPG colored by Continent')
plt.show()
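In [ ]:
# A sketch adding a legend, since the colors alone do not identify the groups.
# The region names assume the usual coding of the ISLR Auto data
# (1 = American, 2 = European, 3 = Japanese).
for code, name, col in [(1, 'American', 'orange'), (2, 'European', 'blue'), (3, 'Japanese', 'green')]:
    subset = Auto[Auto['origin'] == code]
    plt.scatter(subset['weight'], subset['mpg'], c=col, s=15, label=name)
plt.xlabel('Weight'); plt.ylabel('MPG'); plt.title('Weight vs MPG colored by Origin')
plt.legend(); plt.show()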
In [294]:
# The relationship becomes less steep for heavier cars.
In [320]:
# Fit a linear regression model with dummy variables, allowing different INTERCEPTS
DummyColumns = pd.get_dummies(Auto['origin'], dtype=int, drop_first=True) # Dummy variables for origins 2 and 3; origin 1 is the baseline
Dummies = DummyColumns.rename(columns={2: 'origin2', 3: 'origin3'})
X = sm.add_constant(Dummies)
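In [ ]:
# Quick check (a sketch): inspect the design matrix to confirm the dummy coding;
# origin 1 is the dropped baseline level, absorbed into the intercept.
print(X.head())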
In [322]:
reg_dummies = sm.OLS(MPG, X).fit()
print(reg_dummies.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.333
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     98.45
Date:                Wed, 31 Jul 2024   Prob (F-statistic):           2.12e-35
Time:                        10:40:18   Log-Likelihood:                -1299.2
No. Observations:                 397   AIC:                             2604.
Df Residuals:                     394   BIC:                             2616.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         20.0718      0.407     49.339      0.000      19.272      20.872
origin2        7.8197      0.867      9.018      0.000       6.115       9.524
origin3       10.3789      0.828     12.540      0.000       8.752      12.006
==============================================================================
Omnibus:                       25.088   Durbin-Watson:                   0.753
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               28.611
Skew:                           0.657   Prob(JB):                     6.13e-07
Kurtosis:                       3.020   Cond. No.                         3.16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
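In [ ]:
# Sanity check (a sketch): with dummies only, the fitted values are the per-origin
# mean MPG, so the group means should equal const, const + origin2, and const + origin3.
print(Auto.groupby('origin')['mpg'].mean())
print(reg_dummies.params)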
In [324]:
# Plot the resulting regression lines in different colors
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_dummies.predict(X), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with different intercepts only (no slope)');
plt.show()
In [342]:
# Include a common slope.
X1 = pd.concat([X, Weight], axis=1)
reg_int_oneslope = sm.OLS(MPG,X1).fit()
print(reg_int_oneslope.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.702
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     307.9
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          8.82e-103
Time:                        10:47:43   Log-Likelihood:                -1139.6
No. Observations:                 397   AIC:                             2287.
Df Residuals:                     393   BIC:                             2303.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         43.6896      1.107     39.481      0.000      41.514      45.865
origin2        1.2190      0.654      1.865      0.063      -0.066       2.504
origin3        2.3592      0.663      3.556      0.000       1.055       3.663
weight        -0.0070      0.000    -22.021      0.000      -0.008      -0.006
==============================================================================
Omnibus:                       37.597   Durbin-Watson:                   0.813
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.086
Skew:                           0.662   Prob(JB):                     1.80e-12
Kurtosis:                       4.232   Cond. No.                     1.82e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.82e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [346]:
# This regression has a slightly higher R^2 and adjusted R^2 than the weight-only model (0.702 vs. 0.692).
# The common slope and origin3 are significant; origin2 is only marginal (p = 0.063). Plot:
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_int_oneslope.predict(X1), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with different intercepts and one common slope');
plt.show()
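In [ ]:
# Alternative plot (a sketch): draw the three parallel fitted lines over a weight grid
# instead of scattering the predictions; assumes the coefficient names in reg_int_oneslope.
grid = np.linspace(Weight.min(), Weight.max(), 100)
b = reg_int_oneslope.params
for o2, o3, col in [(0, 0, 'orange'), (1, 0, 'blue'), (0, 1, 'green')]:
    plt.plot(grid, b['const'] + b['origin2'] * o2 + b['origin3'] * o3 + b['weight'] * grid, color=col)
plt.scatter(Weight, MPG, c=Auto['color'], s=15)
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon')
plt.title('Parallel fitted lines by origin'); plt.show()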
In [352]:
# Fit a linear regression model with interaction terms, allowing different slopes for different continents
X1['origin2_weight'] = X1['origin2']*X1['weight']
X1['origin3_weight'] = X1['origin3']*X1['weight']
reg_interactions = sm.OLS(MPG,X1).fit()
reg_interactions.summary()
Out[352]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.703
Method:                 Least Squares   F-statistic:                     188.1
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          1.14e-101
Time:                        10:54:14   Log-Likelihood:                -1136.4
No. Observations:                 397   AIC:                             2285.
Df Residuals:                     391   BIC:                             2309.
Df Model:                           5
Covariance Type:            nonrobust
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             42.9846      1.179     36.465      0.000      40.667      45.302
origin2            2.3912      2.847      0.840      0.401      -3.206       7.988
origin3           11.2755      3.583      3.147      0.002       4.231      18.320
weight            -0.0068      0.000    -19.973      0.000      -0.007      -0.006
origin2_weight    -0.0004      0.001     -0.365      0.715      -0.003       0.002
origin3_weight    -0.0039      0.002     -2.527      0.012      -0.007      -0.001
==================================================================================
Omnibus:                       42.084   Durbin-Watson:                   0.819
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               61.346
Skew:                           0.720   Prob(JB):                     4.78e-14
Kurtosis:                       4.278   Cond. No.                     5.36e+04
==================================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
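In [ ]:
# Equivalent fit (a sketch) using the statsmodels formula interface, which builds the
# dummies and interaction terms automatically; coefficient names differ, but the
# estimates should match reg_interactions above.
import statsmodels.formula.api as smf
reg_formula = smf.ols('mpg ~ weight * C(origin)', data=Auto).fit()
print(reg_formula.summary())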
In [354]:
# Adding the interactions improves R^2 and adjusted R^2 only slightly (0.706 vs. 0.702).
# The origin3-weight interaction is significant (p = 0.012), but the origin2-weight interaction is not (p = 0.715). Plot:
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_interactions.predict(X1), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with origin-weight interactions');
plt.show()
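In [ ]:
# Closing check (a sketch): a partial F-test comparing the nested models, asking whether
# the two interaction terms add explanatory power beyond the common-slope model.
from statsmodels.stats.anova import anova_lm
print(anova_lm(reg_int_oneslope, reg_interactions))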